PyTorch Multi-layer Perceptron (MLP) for Multi-Class Classification

Dataset

A random n-class classification dataset can be generated using sklearn.datasets.make_classification. Here, we generate a dataset with two features and 3000 instances. Moreover, the dataset is generated for multiclass classification with four classes.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from num2words import num2words

n_features = 2
n_classes = 4

# Synthetic multi-class dataset: (n_classes-1)*1000 = 3000 samples,
# 2 informative features, 4 classes, one cluster per class.
X, y = make_classification(n_samples=int((n_classes - 1) * 1e3),
                           n_features=n_features,
                           n_redundant=0,
                           n_classes=n_classes,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)

# Map each integer class label to its English word, e.g. 0 -> 'Zero'.
Labels_dict = {label: num2words(label).title() for label in np.unique(y)}

Data = pd.DataFrame(data=X, columns=[f'Feature {i + 1}' for i in range(n_features)])
Target = 'Outcome Variable'
Data[Target] = y
display(Data)

from HD_DeepLearning import Plot_Data

# Plot options for the scatter/boundary helper
# (the 'cricle_size' key spelling follows what the helper expects).
PD = dict(BP=.5, alpha=.7, bg_alpha=0.25, grid=True, cricle_size=50,
          FigSize=7, h=0.02, pad=1, ColorMap='Set1', Labels=list(Labels_dict.values()))

Plot_Data(X, y, PD=PD, Labels_dict=Labels_dict, ax=None)
Feature 1 Feature 2 Outcome Variable
0 -1.689380 1.233636 2
1 -1.357824 1.236826 2
2 1.104089 1.308663 3
3 -1.395940 -0.465010 0
4 -0.221240 -2.495895 0
... ... ... ...
2995 -1.457264 1.179276 2
2996 0.998499 1.209994 3
2997 -0.301584 -2.114714 0
2998 1.111228 0.011635 3
2999 -1.316420 -0.448300 0

3000 rows × 3 columns

Train and Test Sets

In [2]:
# Explode every pie slice slightly and pull the last class out further.
Pull = [.01] * (len(Labels_dict) - 1) + [.1]

import plotly.express as px
from HD_DeepLearning import DatasetTargetDist

# Layout options for the target-distribution pie chart and summary table.
PD = dict(PieColors=px.colors.sequential.deep, TableColors=['Navy', 'White'], hole=.4,
          row_heights=[0.35, 0.65], textfont=14, height=500,
          tablecolumnwidth=[0.25, 0.15, 0.15],
          pull=Pull, legend_title=Target, title_x=0.5, title_y=.9,
          pie_legend=[0.1, 0.12])
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)

StratifiedShuffleSplit returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 70/30 split: class proportions are preserved in both subsets.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
for train_index, test_index in sss.split(X, y):
    # split() yields *positional* indices, so pandas objects must be
    # indexed with .iloc.  (The original used .loc for DataFrames, which
    # is only correct with a default RangeIndex, and had two identical
    # branches for y.)
    # X
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    # y
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss

from HD_DeepLearning import Train_Test_Dist

# Reuse the earlier plot options, adjusting the layout for the
# side-by-side train/test distribution view.
PD.update(column_widths=[0.3, 0.3, 0.3], tablecolumnwidth=[0.2, 0.4],
          height=550, legend_title=Target)

Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict)

Modeling: PyTorch Multi-layer Perceptron (MLP) for Multi-Class Classification

A multi-layer perceptron (MLP) is a class of feedforward artificial neural network (ANN). The algorithm at each iteration uses the Cross-Entropy Loss to measure the loss, and then the gradient and the model update is calculated. At the end of this iterative process, we would reach a better level of agreement between test and predicted sets since the error would be lower from that of the first step.

In [4]:
import torch

def TorchSets(Set):
    """Convert an array-like (numpy array, DataFrame or Series) to a torch tensor.

    1-D inputs are treated as class-label vectors and cast to int64
    (LongTensor), as required by CrossEntropyLoss; 2-D feature matrices
    keep their original dtype.  The tensor is moved to the GPU when one
    is available.

    Note: the original wrapped results in torch.autograd.Variable, which
    has been a deprecated no-op since PyTorch 0.4 — plain tensors carry
    autograd state themselves, so the wrapper (and the duplicated
    CPU/GPU branches) are removed.
    """
    if isinstance(Set, (pd.DataFrame, pd.Series)):
        Set = Set.values
    Out = torch.from_numpy(Set)
    if Set.ndim == 1:
        # Class-label vector: CrossEntropyLoss expects int64 targets.
        Out = Out.type(torch.LongTensor)
    if torch.cuda.is_available():
        Out = Out.cuda()
    return Out

# Tensors (labels become LongTensor, feature matrices keep their dtype)
X_train_tensor = TorchSets(X_train)
y_train_tensor = TorchSets(y_train)
X_test_tensor = TorchSets(X_test)
y_test_tensor = TorchSets(y_test)

Batch_size = 100
iteration_number = int(2e2)

# Number of full passes over the training data needed to reach
# iteration_number mini-batch updates.
epochs_number = int(iteration_number / (len(X_train) / Batch_size))

# Pytorch train and test sets
Train_set = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
Test_set = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

# Data loaders.  BUG FIX: the test loader previously wrapped Train_set,
# so every "test" metric downstream was computed on training data; it
# now wraps Test_set.
# NOTE(review): shuffle=False on the training loader means every epoch
# sees batches in the same order — consider shuffle=True for SGD.
train_loader = torch.utils.data.DataLoader(Train_set, batch_size=Batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(Test_set, batch_size=Batch_size, shuffle=False)
In [5]:
class MLP_Model(torch.nn.Module):
    '''
    A MLP with two hidden ReLU layers for multi-class classification.

    forward() returns raw logits.  BUG FIX: the original applied
    Softmax(dim=1) before handing the output to CrossEntropyLoss, but
    CrossEntropyLoss already combines LogSoftmax and NLLLoss, so the
    probabilities were softmaxed twice — which flattens gradients and
    slows training.  Argmax class predictions are unaffected; apply
    torch.softmax(logits, dim=1) explicitly if probabilities are needed.
    '''
    def __init__(self, input_Size, hidden_Size, output_Size):
        super(MLP_Model, self).__init__()

        # Input layer to the 1st hidden layer (He init suits ReLU).
        self.fc1 = torch.nn.Linear(input_Size, hidden_Size)
        torch.nn.init.kaiming_uniform_(self.fc1.weight, nonlinearity='relu')
        self.act1 = torch.nn.ReLU()

        # 1st hidden layer to the 2nd hidden layer (1/4 of the width).
        self.fc2 = torch.nn.Linear(hidden_Size, int(hidden_Size / 4))
        torch.nn.init.kaiming_uniform_(self.fc2.weight, nonlinearity='relu')
        self.act2 = torch.nn.ReLU()

        # 2nd hidden layer to the output logits.
        self.fc3 = torch.nn.Linear(int(hidden_Size / 4), output_Size)
        torch.nn.init.kaiming_uniform_(self.fc3.weight)

    def forward(self, x):
        # Input -> hidden 1 -> hidden 2
        out = self.act1(self.fc1(x))
        out = self.act2(self.fc2(out))
        # Raw logits; CrossEntropyLoss applies log-softmax internally.
        return self.fc3(out)

Fitting the model

In [6]:
input_Size, output_Size = n_features, len(Labels_dict)
hidden_Size = 256

# model
model = MLP_Model(input_Size, hidden_Size, output_Size)

# GPU
if torch.cuda.is_available():
    model.cuda()

# Cross-entropy loss (expects raw logits and int64 class targets).
criterion = torch.nn.CrossEntropyLoss()

# SGD with momentum.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=.9)

# Training-loop bookkeeping.  (Unused MSE_list/MAE_list removed.)
Count = 0
Loss_list = []
Iteration_list = []
Accuracy_list = []
Steps = 10  # evaluate on the test loader every `Steps` updates

import progressbar
Progress_Bar = progressbar.ProgressBar(maxval=iteration_number + 200,
                                       widgets=[progressbar.Bar('=', '|', '|'),
                                                progressbar.Percentage()])

for epoch in range(epochs_number):
    for i, (Xtr, ytr) in enumerate(train_loader):

        # Flatten each batch to (batch, n_features).  The deprecated
        # torch.autograd.Variable wrappers are removed (no-ops since
        # PyTorch 0.4).
        Xtr = Xtr.view(-1, n_features)

        # Set all gradients to zero
        optimizer.zero_grad()

        # Forward / loss / backward / parameter update
        Out = model(Xtr.float())
        loss = criterion(Out, ytr.long())
        loss.backward()
        optimizer.step()

        Count += 1

        del Xtr, ytr

        # Periodic evaluation on the held-out loader
        if Count % Steps == 0:
            Correct, Total = 0, 0
            # BUG FIX: run inference without building autograd graphs —
            # the original evaluated with gradients enabled.
            with torch.no_grad():
                for Xts, yts in test_loader:
                    Xts = Xts.view(-1, n_features)

                    # Forward
                    Out = model(Xts.float())

                    # Predicted class = argmax over the outputs
                    Predicted = torch.max(Out.data, 1)[1]

                    # Total number of yts
                    Total += len(yts)

                    # Total correct predictions (as a Python int)
                    Correct += (Predicted == yts).sum().item()
            del Xts, yts
            # Store plain Python scalars instead of tensors: the original
            # kept `loss.data` tensors alive (retaining GPU memory) and
            # needed .cpu().data.numpy() conversions afterwards.
            Loss_list.append(loss.item())
            Iteration_list.append(Count)
            Accuracy_list.append(Correct / float(Total))

        Progress_Bar.update(Count)

Progress_Bar.finish()

history = pd.DataFrame({'Iteration': np.array(Iteration_list),
                        'Loss': np.array(Loss_list),
                        'Accuracy': np.array(Accuracy_list)})
del Loss_list, Iteration_list, Accuracy_list
|=========================================================================|100%

Model Performance

In [7]:
from HD_DeepLearning import Plot_history

# Layout options for the loss/accuracy history figure and its table.
PD = dict(row_heights=[0.4, 0.6], lw=1.5, font_size=12, height=700, yLim=1.5,
          th_line_color='Navy', th_fill_color='darkslategray',
          table_columnwidth=[0.4, 0.4, 0.4, 0.4],
          tc_line_color='Navy', tc_fill_color=None,
          title_x=0.46, title_y=0.92, tb_cell_heigh=20,
          Number_Format='%.4e')

Plot_history(history, PD, Title='Test Set', Colors=['DarkGreen', 'Red'])
In [8]:
from HD_DeepLearning import Plot_Classification_Torch
import matplotlib.pyplot as plt

PD = dict(BP=.5, alpha=.7, bg_alpha=0.15, grid=False, cricle_size=50,
          FigSize=7, h=0.02, pad=1, ColorMap='Set1', Labels=list(Labels_dict.values()))

fig, ax = plt.subplots(1, 2, figsize=(16, 7))
# BUG FIX: the cell imported Plot_Classification_Torch but called the
# undefined name Plot_Classification (NameError), and y_train/y_test are
# 1-D integer label vectors, so .argmax(axis=1) would raise an AxisError
# — the labels are passed directly instead.
# Train Set
Plot_Classification_Torch(model, X_train, y_train, PD=PD, ax=ax[0])
_ = ax[0].set_title('Train Set', fontsize=16, weight='bold')
# Test Set
Plot_Classification_Torch(model, X_test, y_test, PD=PD, ax=ax[1])
_ = ax[1].set_title('Test Set', fontsize=16, weight='bold')

Confusion Matrix

The confusion matrix allows for visualization of the performance of an algorithm. Note that due to the size of data, here we don't provide a Cross-validation evaluation. In general, this type of evaluation is preferred.

In [10]:
from sklearn import metrics

def _predict_labels(tensor):
    # Forward pass -> argmax class index as a numpy vector, without
    # building autograd graphs (inference only).
    with torch.no_grad():
        logits = model(tensor.float())
    return torch.max(logits, 1)[1].cpu().numpy()

# Train
y_pred = _predict_labels(X_train_tensor)
Reports_Train = pd.DataFrame(metrics.classification_report(
    y_train, y_pred, target_names=list(Labels_dict.values()),
    output_dict=True)).T
CM_Train = metrics.confusion_matrix(y_train, y_pred)

# Test
y_pred = _predict_labels(X_test_tensor)
Reports_Test = pd.DataFrame(metrics.classification_report(
    y_test, y_pred, target_names=list(Labels_dict.values()),
    output_dict=True)).T
CM_Test = metrics.confusion_matrix(y_test, y_pred)

Reports_Train = Reports_Train.reset_index().rename(columns={'index': 'Train Set'})
Reports_Test = Reports_Test.reset_index().rename(columns={'index': 'Test Set'})

display(Reports_Train.style.hide(axis='index').
        set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).
        set_properties(subset=['Train Set'],
                       **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide(axis='index').
        set_properties(**{'background-color': 'Azure', 'color': 'Black'}).
        set_properties(subset=['Test Set'],
                       **{'background-color': 'RoyalBlue', 'color': 'White'}))

from HD_DeepLearning import Confusion_Mat
PD = dict(FS=(14, 6), annot_kws=14, shrink=.6, Labels=list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD=PD, n_splits=None)
Train Set precision recall f1-score support
Zero 0.893048 0.957935 0.924354 523.000000
One 0.813688 0.812144 0.812915 527.000000
Two 0.969758 0.914449 0.941292 526.000000
Three 0.874275 0.862595 0.868396 524.000000
accuracy 0.886667 0.886667 0.886667 0.886667
macro avg 0.887692 0.886781 0.886739 2100.000000
weighted avg 0.887662 0.886667 0.886668 2100.000000
Test Set precision recall f1-score support
Zero 0.900415 0.968750 0.933333 224.000000
One 0.792793 0.778761 0.785714 226.000000
Two 0.967136 0.915556 0.940639 225.000000
Three 0.839286 0.835556 0.837416 225.000000
accuracy 0.874444 0.874444 0.874444 0.874444
macro avg 0.874907 0.874656 0.874276 900.000000
weighted avg 0.874788 0.874444 0.874112 900.000000

References

  1. Stathakis, D. (2009). How many hidden layers and nodes?. International Journal of Remote Sensing, 30(8), 2133-2147.
  2. Artificial neural network. Retrieved June 02, 2020, from https://en.wikipedia.org/wiki/Artificial_neural_network.